# Pearson correlation (interval-level IV and DV); argument order is y, x.
# NOTE(review): `data` is not created anywhere above this line; presumably it
# is the sample of `diamonds` built further down in the file -- confirm.
cor.test(data$price, data$carat, method = "pearson")

# Kendall's tau (similarity)
# Recall that Kendall's tau compares ordinal level data: IV and DV.
# NULL = There is no relationship between the variables.
#
# cor.test() requires numeric vectors. Passing the factor columns directly,
# i.e. cor.test(data$clarity, data$cut, method = "kendall"), stops with
# "'x' must be a numeric vector", so look at the two factors first and then
# convert them to numbers.
qplot(x = clarity, data = data)
qplot(x = cut, data = data)
summary(data$clarity)

# as.numeric() on a factor returns the integer code of each level, which lets
# R treat the categories as numbers that can be ranked.
# NOTE(review): this is only meaningful if the level order reflects the real
# ordering of clarity and cut -- confirm with levels().
num.clarity <- as.numeric(data$clarity)
num.cut <- as.numeric(data$cut)

cor.test(num.clarity, num.cut, method = "kendall")
# Chi-square test (similarity).
# Chi-square is the test for two nominal-level variables (IV and DV).
chisq.test(x = data$cut, y = data$clarity)
# The same test is run a second time, as in the original session.
chisq.test(x = data$cut, y = data$clarity)
# Alternative route: build the cut-by-clarity contingency table first ...
mytable <- xtabs(formula = ~ cut + clarity, data = data)
mytable
# ... because summary() of such a table reports a chi-square test of
# independence.
summary(mytable)
# Independent 2-group t-test, two-vector form:
# t.test(y, x) where both variables are numeric (i.e. interval).
t.test(x = data$price, y = data$carat)
# install.packages("openintro")
# library() stops with an error when a package is missing; require() would
# only warn and let the script fail later at the first use.
library(openintro)
library(ggplot2)
data(gifted)

# Descriptive statistics for `count`, one function at a time.
min(gifted$count) # referencing variables
mean(gifted$count)
median(gifted$count)
sd(gifted$count)
max(gifted$count)
# OR:
summary(gifted$count) # missing standard deviation
sd(gifted$count)

# histogram
hist(gifted$count)
# make it look a little prettier
qplot(x = count, data = gifted, xlab = "Age in Months", ylab = "Count",
      main = "Distribution of Count")

# Histograms of the diamonds variables.
# NOTE(review): these use `data`, which is not created in the lines above;
# presumably the diamonds sample built later in the file -- confirm.
hist(data$carat)
hist(data$price)
# Z test of H0: mu = 32 against Ha: mu < 32 for gifted$count.
mean.gifted <- mean(gifted$count)
# Stored under its own name so the result does not mask the sd() function.
sd.count <- sd(gifted$count)
# Sample size taken from the data instead of a hard-coded 36.
n.count <- length(gifted$count)
# z = (observed mean - hypothesised mean) / standard error
se <- sd.count / sqrt(n.count)
z <- (mean.gifted - 32) / se
p <- pnorm(z) # left tail because Ha < 32
print(p)
# What if Ha was Ha: u > 32?
p2 <- 1 - p
print(p2)

# t test: same statistic, but compared against a t distribution with
# n - 1 degrees of freedom.
t.stat <- (mean.gifted - 32) / se
print(t.stat)
df.t <- n.count - 1
print(df.t)

# Interval around the sample mean using a critical value of 1.65
# (approximately the normal quantile for a 90% two-sided interval).
upper <- mean.gifted + (1.65 * se)
lower <- mean.gifted - (1.65 * se)
ci <- c(lower, upper)
print(ci)
# produce a new variable diff.iq: mother's IQ minus father's IQ
# (it has to exist before it can be summarised)
diff.iq <- gifted$motheriq - gifted$fatheriq
# summary statistics: diff.iq
# Stored as diff.summary so base::diff() is not masked.
diff.summary <- summary(diff.iq)
print(diff.summary)
sd.diff <- sd(diff.iq)
print(sd.diff)
hist(diff.iq)

# conduct a paired t test between mother iq and father iq
t.test(gifted$motheriq, gifted$fatheriq, paired = TRUE)

# manually calculate a paired t-test
# H0: mu = 0
# Ha: mu does not equal 0
# T = (observed mean difference - hypothesised difference) / s.e. difference
# a = 0.05 (set by me)
n.diff <- length(diff.iq)
se <- sd.diff / sqrt(n.diff)
# Use the computed mean difference rather than a rounded, hand-typed value,
# and do not assign to `T`, which is R's shorthand for TRUE.
t.paired <- (mean(diff.iq) - 0) / se
print(t.paired)
df.paired <- n.diff - 1
print(df.paired)
# Load packages with library(), which stops immediately if one is missing
# (require() only returns FALSE with a warning).
library(foreign)
library(readstata13)
library(ggplot2) # the name ends in the digit 2, not the letter z
library(lattice)
library(car)
library(dplyr)
###########################################
# Import data
###########################################
# Draw a random sample of 1000 rows from diamonds; no seed is set, so the
# rows (and every statistic below) differ from run to run.
data <- diamonds %>% sample_n(1000)
# examine our data and variables of interest: size (carat) and price.
summary(data)
summary(data$price)
qplot(x = carat, data = data)
qplot(x = price, data = data)
# we use the distribution of the data to inform what type of test we use
hist(data$carat)
hist(data$price)
# Scatterplot
# The command for a scatterplot is plot(INDEPENDENT VARIABLE, DEPENDENT VARIABLE)
plot(data$carat, data$price)
# if we want to add labels
plot(data$carat, data$price, ylab = "Price",
     xlab = "Carat", main = "My Scatterplot!")
# Add a trend line
# OUTCOME ~ PREDICTOR, DATA = DATA NAME
abline(reg = lm(price ~ carat, data = data), lty = 1, lwd = 4,
       col = "red")
# mean by groups
# summary statistics by group
# install.packages("psych")
library(psych)
describeBy(diamonds, diamonds$cut)
describeBy(diamonds$price, diamonds$cut)
# but you need to pay attention to the measurement of the variables
# independent 2-group t-test
# t.test(y~x) # where y is numeric and x is a binary factor
qplot(x = cut, data = diamonds)
# create two categories: fair and good - ideal.
# do fair diamonds have a different mean price than other cuts?
# "0" = Fair, "1" = every other cut.
diamonds$cutcat <- as.factor(ifelse(diamonds$cut == "Fair", "0", "1"))
# View(diamonds)
summary(diamonds$cutcat)
t.test(diamonds$price ~ diamonds$cutcat)

# Trend line for price against carat.
# abline() adds to the most recent base-graphics plot, and the last plot
# drawn above is a ggplot, so redraw the scatterplot first.
plot(data$carat, data$price, ylab = "Price",
     xlab = "Carat", main = "My Scatterplot!")
# OUTCOME ~ PREDICTOR, DATA = DATA NAME
abline(reg = lm(price ~ carat, data = data), lty = 1, lwd = 4,
       col = "red")
# A bare `price` used to follow here; no object of that name is created in
# this script (price is a column of the data frames), so the line only
# raised "object 'price' not found" and has been dropped.
# Import data
###########################################
# Random draw of 1000 rows from diamonds.
data <- sample_n(diamonds, 1000)
# Or
# data <- diamonds
# In class question: how is our data distributed?
summary(data)
# Look at the data again, then at the two variables of interest:
# size (carat) and price.
summary(data)
summary(data$price)
qplot(data = data, x = carat)
qplot(data = data, x = price)
# The shape of each distribution informs which type of test we use.
hist(x = data$carat)
hist(x = data$price)
# Scatterplot with axis labels and a title.
plot(
  x = data$carat,
  y = data$price,
  main = "My Scatterplot!",
  xlab = "Carat",
  ylab = "Price"
)
# Add a trend line.
# OUTCOME ~ PREDICTOR, DATA = DATA NAME
abline(
  reg = lm(formula = price ~ carat, data = data),
  col = "red",
  lwd = 4,
  lty = 1
)
# Pearson correlation test (similarity).
# Pearson tests compare interval-level data: IV and DV. Order is y, x.
cor.test(x = data$price, y = data$carat, method = "pearson")
# Correlation coefficient: how the 2 variables "move" together.
# cor(Y, X)
cor(x = data$price, y = data$carat)
# Pearson correlation test once more, as in the original session.
cor.test(x = data$price, y = data$carat, method = "pearson")
# create a sample of the diamonds data
# The draw is random and unseeded, so the sample mean changes on every run.
sample <- diamonds %>% sample_n(1000)
mean(sample$price) # "sample" mean (3969.098 in the original session)
mean(diamonds$price) # "population" mean (3932.8 in the original session)
# how would we test?
# Standard error from the actual number of sampled rows.
n.sample <- nrow(sample)
se.sample <- sd(sample$price) / sqrt(n.sample)
# z statistic for H0: mu = 0, using the mean of the sample just drawn instead
# of a value typed in from an earlier draw.
z2 <- (mean(sample$price) - 0) / se.sample
print(z2)
pnorm(z2) # area to the left
1 - pnorm(z2) # area to the right
pnorm(z2, lower.tail = FALSE) # area to the right
# Load packages with library(), which stops immediately if one is missing
# (require() only returns FALSE with a warning).
library(foreign)
library(readstata13)
library(ggplot2) # the name ends in the digit 2, not the letter z
library(lattice)
library(car)
library(dplyr)
# clear the environment
# NOTE(review): rm(list = ls()) is discouraged in scripts because it wipes the
# caller's workspace. It is kept here because it also drops any workspace copy
# of `diamonds`, so the name refers to the packaged data set again -- confirm
# before removing.
rm(list = ls())
###########################################
# Import data
###########################################
# Random, unseeded draw of 1000 rows from diamonds.
data <- diamonds %>% sample_n(1000)
# examine our data and variables of interest: size (carat) and price.
summary(data)
summary(data$price)
qplot(x = carat, data = data)
qplot(x = price, data = data)
# we use the distribution of the data to inform what type of test we use
hist(data$carat)
hist(data$price)
# Scatterplot.
# Argument order is plot(INDEPENDENT VARIABLE, DEPENDENT VARIABLE).
plot(x = data$carat, y = data$price)
# The same plot with axis labels and a title.
plot(
  x = data$carat,
  y = data$price,
  main = "My Scatterplot!",
  xlab = "Carat",
  ylab = "Price"
)
# Add a trend line.
# OUTCOME ~ PREDICTOR, DATA = DATA NAME
abline(
  reg = lm(formula = price ~ carat, data = data),
  col = "red",
  lwd = 4,
  lty = 1
)
# Correlation coefficient: how the 2 variables "move" together.
# cor(Y, X)
cor(x = data$price, y = data$carat)
# Pearson correlation test (similarity).
# Pearson tests compare interval-level data: IV and DV. Order is y, x.
cor.test(x = data$price, y = data$carat, method = "pearson")
# Pearson correlation test once more, as in the original session.
cor.test(x = data$price, y = data$carat, method = "pearson")
# Kendall's tau (similarity)
# Recall that Kendall's tau compares ordinal level data: IV and DV.
# NULL = There is no relationship between the variables.
#
# cor.test() requires numeric vectors. Passing the factor columns directly,
# i.e. cor.test(data$clarity, data$cut, method = "kendall"), stops with
# "'x' must be a numeric vector", so inspect the factors and convert them.
qplot(x = clarity, data = data)
qplot(x = cut, data = data)
summary(data$clarity)
# as.numeric() on a factor returns the integer code of each level, which lets
# R treat the categories as numbers that can be ranked.
# NOTE(review): only meaningful if the level order reflects the real ordering
# of clarity and cut -- confirm with levels().
num.clarity <- as.numeric(data$clarity)
num.cut <- as.numeric(data$cut)
cor.test(num.clarity, num.cut, method = "kendall")
# Chi-square test (similarity).
# Chi-square is the test for two nominal-level variables (IV and DV).
chisq.test(x = data$cut, y = data$clarity)
# Alternative route: build the cut-by-clarity contingency table first ...
mytable <- xtabs(formula = ~ cut + clarity, data = data)
mytable
# ... because summary() of such a table reports a chi-square test of
# independence.
summary(mytable)
# Independent 2-group t-test, two-vector form:
# t.test(y, x) where both variables are numeric (i.e. interval).
t.test(x = diamonds$price, y = diamonds$carat)
# Pay attention to how the variables are measured, though.
# Independent 2-group t-test, formula form:
# t.test(y ~ x) where y is numeric and x is a binary factor.
qplot(data = diamonds, x = cut)
# Compare the group means: summary statistics for each cut.
# install.packages("psych")
library(psych)
describeBy(diamonds, diamonds$cut)
describeBy(diamonds$price, diamonds$cut)
# Collapse cut into two categories: "0" for Fair, "1" for Good through Ideal.
# Question: do fair diamonds have a different mean price than other cuts?
diamonds$cutcat <- as.factor(
  ifelse(diamonds$cut == "Fair", "0", "1")
)
# View(diamonds)
summary(diamonds$cutcat)
t.test(diamonds$price ~ diamonds$cutcat)
# create a sample of the diamonds data
# The draw is random and unseeded, so the sample mean changes on every run.
sample <- diamonds %>% sample_n(1000)
mean(sample$price) # "sample" mean (3969.098 in the original session)
mean(diamonds$price) # "population" mean (3932.8 in the original session)
# how would we test?
# Standard error from the actual number of sampled rows.
n.sample <- nrow(sample)
se.sample <- sd(sample$price) / sqrt(n.sample)
# z statistic: sample mean against the mean of all of diamonds, both computed
# from the data instead of being typed in from an earlier draw.
z <- (mean(sample$price) - mean(diamonds$price)) / se.sample
print(z)
pnorm(z) # area to the left
1 - pnorm(z) # area to the right
pnorm(z, lower.tail = FALSE) # area to the right

# Second test: the same sample mean against a hypothesised mean of 0.
mean(sample$price) # "sample" mean (1796.6 in the original session)
# how would we test?
se.sample <- sd(sample$price) / sqrt(n.sample)
z2 <- (mean(sample$price) - 0) / se.sample
print(z2)
pnorm(z2) # area to the left
1 - pnorm(z2) # area to the right
pnorm(z2, lower.tail = FALSE) # area to the right
# install.packages("openintro")
# library() stops with an error when the package is missing; require() would
# only warn and let data(gifted) fail afterwards.
library(openintro)
data(gifted)
# Descriptive statistics for `count`, one function at a time.
min(gifted$count) # referencing variables
mean(gifted$count)
median(gifted$count)
sd(gifted$count)
max(gifted$count)
# OR:
summary(gifted$count) # missing standard deviation
sd(gifted$count)
# histogram
hist(gifted$count)
# make it look a little prettier
qplot(x = count, data = gifted, xlab = "Age in Months", ylab = "Count",
      main = "Distribution of Count")
# Z test of H0: mu = 32 against Ha: mu < 32 for gifted$count.
mean.gifted <- mean(gifted$count)
# Stored under its own name so the result does not mask the sd() function.
sd.count <- sd(gifted$count)
# Sample size taken from the data instead of a hard-coded 36.
n.count <- length(gifted$count)
# z = (observed mean - hypothesised mean) / standard error
se <- sd.count / sqrt(n.count)
z <- (mean.gifted - 32) / se
p <- pnorm(z) # left tail because Ha < 32
print(p)
# What if Ha was Ha: u > 32?
p2 <- 1 - p
print(p2)

# t test: same statistic, but compared against a t distribution with
# n - 1 degrees of freedom.
t.stat <- (mean.gifted - 32) / se
print(t.stat)
df.t <- n.count - 1
print(df.t)

# Interval around the sample mean using a critical value of 1.65
# (approximately the normal quantile for a 90% two-sided interval).
upper <- mean.gifted + (1.65 * se)
lower <- mean.gifted - (1.65 * se)
ci <- c(lower, upper)
print(ci)
# produce a new variable diff.iq: mother's IQ minus father's IQ
diff.iq <- gifted$motheriq - gifted$fatheriq
# summary statistics: diff.iq
# Stored as diff.summary so base::diff() is not masked.
diff.summary <- summary(diff.iq)
print(diff.summary)
sd.diff <- sd(diff.iq)
print(sd.diff)
hist(diff.iq)

# conduct a paired t test between mother iq and father iq
t.test(gifted$motheriq, gifted$fatheriq, paired = TRUE)

# manually calculate a paired t-test
# H0: mu = 0
# Ha: mu does not equal 0
# T = (observed mean difference - hypothesised difference) / s.e. difference
# a = 0.05 (set by me)
n.diff <- length(diff.iq)
se <- sd.diff / sqrt(n.diff)
# Use the computed mean difference rather than a rounded, hand-typed value,
# and do not assign to `T`, which is R's shorthand for TRUE.
t.paired <- (mean(diff.iq) - 0) / se
print(t.paired)
df.paired <- n.diff - 1
print(df.paired)
# t critical is between 2.042 and 2.021
# |t statistic| = 2.73 > t critical
# Therefore, I reject the null hypothesis that there is no difference between
# mothers' and fathers' iqs
##########################################
# Review
# Import data
# Random, unseeded draw of 1000 rows from diamonds.
data <- diamonds %>% sample_n(1000)
# examine our data and variables of interest: carat and price.
summary(data)
summary(data$price)
hist(data$carat)
hist(data$price)
##########################################
# Review (restart with a clean workspace)
# clear the environment
# NOTE(review): rm(list = ls()) is discouraged in scripts because it wipes the
# caller's workspace. It is kept because it resets `diamonds` to the packaged
# data set before sampling; it also removes `data`, so nothing after this
# point may refer to `data`. An assignment to `diamonds` that used to sit
# directly above this call was dropped: rm() discarded it immediately.
rm(list = ls())
# Import data
# From here on `diamonds` is a 1000-row sample that masks the packaged data.
diamonds <- diamonds %>% sample_n(1000)
# examine our data and variables of interest: carat and price.
summary(diamonds)
summary(diamonds$price)
hist(diamonds$carat)
hist(diamonds$price)
# Summary statistics, spread and distribution of carat ...
summary(diamonds$carat)
sd(x = diamonds$carat)
hist(x = diamonds$carat)
qplot(data = diamonds, x = carat)
# ... and the same for price.
summary(diamonds$price)
sd(x = diamonds$price)
hist(x = diamonds$price)
qplot(data = diamonds, x = price)
# Independent 2-group t-test, two-vector form:
# t.test(y, x) where both variables are numeric (i.e. interval).
t.test(x = diamonds$price, y = diamonds$carat)
# Pay attention to how the variables are measured, though.
# Independent 2-group t-test, formula form:
# t.test(y ~ x) where y is numeric and x is a binary factor.
qplot(data = diamonds, x = cut)
# Means by group: summary statistics for each cut.
# install.packages("psych")
library(psych)
describeBy(diamonds, diamonds$cut)
describeBy(diamonds$price, diamonds$cut)
# Collapse cut into two categories: "0" for Fair, "1" for Good through Ideal.
# Question: do fair diamonds have a different mean price than other cuts?
diamonds$cutcat <- as.factor(
  ifelse(diamonds$cut == "Fair", "0", "1")
)
# View(diamonds)
summary(diamonds$cutcat)
t.test(diamonds$price ~ diamonds$cutcat)
# Scatterplot.
# Argument order is plot(INDEPENDENT VARIABLE, DEPENDENT VARIABLE).
plot(x = diamonds$carat, y = diamonds$price)
# This is the type of graph you needed to make in RS2.
# The same plot with axis labels and a title.
plot(
  x = diamonds$carat,
  y = diamonds$price,
  main = "My Scatterplot!",
  xlab = "Carat",
  ylab = "Price"
)
# Add a trend line.
# OUTCOME ~ PREDICTOR, DATA = DATA NAME
abline(
  reg = lm(formula = price ~ carat, data = diamonds),
  col = "red",
  lwd = 4,
  lty = 1
)
# Correlations.
# cor(Y, X)
cor(x = diamonds$price, y = diamonds$carat)
# The coefficient is computed a second time, as in the original session.
cor(x = diamonds$price, y = diamonds$carat)
# Correlation tests.
# Pearson correlation (similarity): compares interval-level data, IV and DV.
# Order is y, x.
cor.test(x = diamonds$price, y = diamonds$carat, method = "pearson")
# Kendall's tau (correlation using ordinal data (IV and DV))
# NULL = There is no relationship between the variables.
#
# cor.test() requires numeric vectors. Passing the factor columns directly,
# i.e. cor.test(diamonds$clarity, diamonds$cut, method = "kendall"), stops
# with "'x' must be a numeric vector", so inspect the factors and convert them.
qplot(x = clarity, data = diamonds)
qplot(x = cut, data = diamonds)
# Summarise the same data frame that is analysed below (`diamonds`, not the
# `data` object that rm(list = ls()) removed earlier in the file).
summary(diamonds$clarity)
# as.numeric() on a factor returns the integer code of each level, which lets
# R treat the categories as numbers that can be ranked.
# NOTE(review): only meaningful if the level order reflects the real ordering
# of clarity and cut -- confirm with levels().
num.clarity <- as.numeric(diamonds$clarity)
num.cut <- as.numeric(diamonds$cut)
cor.test(num.clarity, num.cut, method = "kendall")
# Chi-square test.
# Chi-square is the test for two nominal-level variables (IV and DV).
chisq.test(x = diamonds$cut, y = diamonds$clarity)
# Alternative route: build the cut-by-clarity contingency table first ...
mytable <- xtabs(formula = ~ cut + clarity, data = diamonds)
mytable
# ... because summary() of such a table reports a chi-square test of
# independence.
summary(mytable)
# Correlation tests.
# Pearson correlation (similarity): compares interval-level data, IV and DV.
# Order is y, x.
cor.test(x = diamonds$price, y = diamonds$carat, method = "pearson")
# Scatterplot
# The command for a scatterplot is plot(INDEPENDENT VARIABLE, DEPENDENT VARIABLE)
# The `data` object was removed by the earlier rm(list = ls()) and never
# recreated, so `data$carat` would try to subset the utils::data function and
# fail. Plot the current `diamonds` sample instead.
plot(diamonds$carat, diamonds$price)
# Load in the dataset (path is relative to the working directory).
fake.data <- read.csv("fake.data.csv")
# Examine the structure of the data frame
str(fake.data)
# View the Data
# View() opens a viewer window, so only call it in an interactive session;
# when the file is run as a script the call is skipped.
if (interactive()) {
  View(fake.data)
}
# Examine the data
summary(fake.data)
mean(fake.data$X1)
summary(fake.data$X1)
summary(fake.data$X2)
summary(fake.data$X3)
summary(fake.data$X4)
summary(fake.data$X5)
# Distribution of X4, as a ggplot and as a base histogram.
qplot(x = X4, data = fake.data)
hist(fake.data$X4)
## create a scatterplot comparing X4 and Y
plot(fake.data$X4, fake.data$Y)
## Is the relationship positive or negative? Add a trend line to find out.
abline(reg = lm(Y ~ X4, data = fake.data), lty = 1, lwd = 4, col = "purple")
# Simple regression of Y on X4: print the fitted model, then its summary.
model.2 <- lm(Y ~ X4, data = fake.data)
model.2
summary(model.2)
# Multiple regression
model.3 <- lm(Y ~ X4 + X1, data = fake.data)
summary(model.3)
# In class question: how do we interpret these results?
plot(fake.data$X4, fake.data$Y)
# lm(Dependent Variable ~ Independent Variable, data = DATA SET NAME)
# Fit on `diamonds`: the `data` object was removed by the earlier
# rm(list = ls()) and never recreated, so `data = data` would hand lm() the
# utils::data function and fail.
model.1 <- lm(price ~ carat, data = diamonds)
# Call the model. The model output is only so informative - i.e. it doesn't tell us a lot.
model.1
# so use "summary" function which gives us a lot more information
summary(model.1)
